# !pip install geopy

# --- Standard library ---
import warnings

# --- Data handling ---
import pandas as pd  # for handling tabular data
import numpy as np   # for linear algebra

# --- Visualization ---
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud

# --- Statistics / machine learning ---
from scipy.stats import skew
from scipy.special import softmax
from sklearn.impute import KNNImputer
from sklearn.cluster import KMeans

# --- Geospatial ---
from geopy.geocoders import Nominatim
from geopy.distance import geodesic, great_circle

# --- NLP ---
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Ignore all warnings (keeps notebook output readable)
warnings.filterwarnings('ignore')

# BUG FIX: the original path 'Dataset\Homestays_Data(in).csv' used a bare
# backslash; '\H' is an invalid escape sequence (DeprecationWarning in
# modern Python) and the path only worked on Windows by accident.
# A forward slash works on Windows, macOS and Linux alike.
df = pd.read_csv('Dataset/Homestays_Data(in).csv')
df.head()
| id | log_price | property_type | room_type | amenities | accommodates | bathrooms | bed_type | cancellation_policy | cleaning_fee | ... | latitude | longitude | name | neighbourhood | number_of_reviews | review_scores_rating | thumbnail_url | zipcode | bedrooms | beds | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6901257 | 5.010635 | Apartment | Entire home/apt | {"Wireless Internet","Air conditioning",Kitche... | 3 | 1.0 | Real Bed | strict | True | ... | 40.696524 | -73.991617 | Beautiful brownstone 1-bedroom | Brooklyn Heights | 2 | 100.0 | https://a0.muscache.com/im/pictures/6d7cbbf7-c... | 11201 | 1.0 | 1.0 |
| 1 | 6304928 | 5.129899 | Apartment | Entire home/apt | {"Wireless Internet","Air conditioning",Kitche... | 7 | 1.0 | Real Bed | strict | True | ... | 40.766115 | -73.989040 | Superb 3BR Apt Located Near Times Square | Hell's Kitchen | 6 | 93.0 | https://a0.muscache.com/im/pictures/348a55fe-4... | 10019 | 3.0 | 3.0 |
| 2 | 7919400 | 4.976734 | Apartment | Entire home/apt | {TV,"Cable TV","Wireless Internet","Air condit... | 5 | 1.0 | Real Bed | moderate | True | ... | 40.808110 | -73.943756 | The Garden Oasis | Harlem | 10 | 92.0 | https://a0.muscache.com/im/pictures/6fae5362-9... | 10027 | 1.0 | 3.0 |
| 3 | 13418779 | 6.620073 | House | Entire home/apt | {TV,"Cable TV",Internet,"Wireless Internet",Ki... | 4 | 1.0 | Real Bed | flexible | True | ... | 37.772004 | -122.431619 | Beautiful Flat in the Heart of SF! | Lower Haight | 0 | NaN | https://a0.muscache.com/im/pictures/72208dad-9... | 94117 | 2.0 | 2.0 |
| 4 | 3808709 | 4.744932 | Apartment | Entire home/apt | {TV,Internet,"Wireless Internet","Air conditio... | 2 | 1.0 | Real Bed | moderate | True | ... | 38.925627 | -77.034596 | Great studio in midtown DC | Columbia Heights | 4 | 40.0 | NaN | 20009 | 0.0 | 1.0 |
5 rows × 29 columns
# Select 5 random samples from the DataFrame
df_sample = df.sample(n=5, random_state=42)
# Print the sampled DataFrame
df_sample
# --------------------------------------
# Quick spot-check of the data: 5 random rows drawn with sample().
# random_state=42 makes the draw reproducible across runs.
# --------------------------------------
| id | log_price | property_type | room_type | amenities | accommodates | bathrooms | bed_type | cancellation_policy | cleaning_fee | ... | latitude | longitude | name | neighbourhood | number_of_reviews | review_scores_rating | thumbnail_url | zipcode | bedrooms | beds | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4079 | 13662370 | 3.806662 | House | Private room | {TV,Internet,"Wireless Internet","Air conditio... | 2 | 1.5 | Real Bed | strict | True | ... | 41.849684 | -87.676270 | Pilsen Arts Community Custom Home | Pilsen | 17 | 97.0 | https://a0.muscache.com/im/pictures/81318153/a... | 60608 | 1.0 | 1.0 |
| 33735 | 4765892 | 4.941642 | Apartment | Entire home/apt | {TV,Internet,"Wireless Internet","Air conditio... | 2 | 2.0 | Real Bed | strict | True | ... | 34.068613 | -118.246455 | Apartment 5 minutes from DTLA & Dodger Stadium | Echo Park | 2 | 100.0 | https://a0.muscache.com/im/pictures/aa00250e-0... | 90012 | 1.0 | 1.0 |
| 69475 | 21169968 | 4.941642 | Apartment | Entire home/apt | {TV,"Cable TV","Wireless Internet","Air condit... | 5 | 1.0 | Real Bed | moderate | True | ... | 40.701958 | -73.917352 | Brand New Huge 2bdr apartment(L,M train 2 min) | Bushwick | 25 | 88.0 | https://a0.muscache.com/im/pictures/d9220535-c... | 11237 | 2.0 | 3.0 |
| 454 | 7939196 | 4.867534 | Apartment | Entire home/apt | {"Cable TV",Internet,"Wireless Internet","Air ... | 6 | 1.0 | Real Bed | strict | True | ... | 40.742959 | -73.990820 | Grande Super Large APT !!! | Flatiron District | 12 | 82.0 | NaN | 10010 | 1.0 | 3.0 |
| 25153 | 18161036 | 3.663562 | House | Private room | {Internet,"Wireless Internet","Air conditionin... | 2 | 1.0 | Real Bed | flexible | True | ... | 34.046473 | -117.734095 | Private Cozy and Clean Rooms in Pomona | NaN | 2 | 100.0 | https://a0.muscache.com/im/pictures/e0c9b2f9-a... | 91766 | 1.0 | 1.0 |
5 rows × 29 columns
# Basic overview of the dataset: column dtypes, non-null counts, memory usage
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 74111 entries, 0 to 74110 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 74111 non-null int64 1 log_price 74111 non-null float64 2 property_type 74111 non-null object 3 room_type 74111 non-null object 4 amenities 74111 non-null object 5 accommodates 74111 non-null int64 6 bathrooms 73911 non-null float64 7 bed_type 74111 non-null object 8 cancellation_policy 74111 non-null object 9 cleaning_fee 74111 non-null bool 10 city 74111 non-null object 11 description 74111 non-null object 12 first_review 58247 non-null object 13 host_has_profile_pic 73923 non-null object 14 host_identity_verified 73923 non-null object 15 host_response_rate 55812 non-null object 16 host_since 73923 non-null object 17 instant_bookable 74111 non-null object 18 last_review 58284 non-null object 19 latitude 74111 non-null float64 20 longitude 74111 non-null float64 21 name 74111 non-null object 22 neighbourhood 67239 non-null object 23 number_of_reviews 74111 non-null int64 24 review_scores_rating 57389 non-null float64 25 thumbnail_url 65895 non-null object 26 zipcode 73145 non-null object 27 bedrooms 74020 non-null float64 28 beds 73980 non-null float64 dtypes: bool(1), float64(7), int64(3), object(18) memory usage: 15.9+ MB
# About data shape
print(f"Number of Rows/Instances: {df.shape[0]}")
print(f"Number of Columns: {df.shape[1]}")
print('-'*30)
print(f"Number of object columns: {len(df.select_dtypes(include='object').columns)}")
print(f"Number of Int/Float columns: {len(df.select_dtypes(exclude='object').columns)}")
# --------------------------------------
# - Number of Rows/Instances: total number of homestay listings in the dataset.
# - Number of Columns: number of features recorded for each listing.
# - Number of object columns: columns holding categorical or textual data.
# - Number of Int/Float columns: all non-object columns (note: this count
#   also includes the single bool column, per the exclude='object' filter).
# --------------------------------------
Number of Rows/Instances: 74111 Number of Columns: 29 ------------------------------ Number of object columns: 18 Number of Int/Float columns: 11
# Labels for the per-column summary table built later in the notebook.
info_cols = [
    'num_of_unique_values',  # Number of unique values in the column
    '%_of_null_values',      # Percentage of null values in the column
    'dtype_of_columns',      # Data type of the column
    'count',                 # Count of non-null values
    'unique',                # Unique values in the column
    'top',                   # Most frequent value
    'freq',                  # Frequency of the most frequent value
    'mean',                  # Mean of numerical values
    'std',                   # Standard deviation of numerical values
    'min',                   # Minimum value
    '25%',                   # 25th percentile
    '50%',                   # 50th percentile (median)
    '75%',                   # 75th percentile
    'max',                   # Maximum value
]

# Original column names — reused as the index of the insight table later.
cols = df.columns

# Distinct values of every column, collected with a single comprehension.
unique_values = [list(df[col].unique()) for col in df.columns]

# Descriptive statistics for all columns; cells that do not apply to a
# column's dtype (e.g. 'mean' for object columns) become a placeholder.
descriptive_stats = df.describe(include='all').fillna("not-applicable")
descriptive_stats
| id | log_price | property_type | room_type | amenities | accommodates | bathrooms | bed_type | cancellation_policy | cleaning_fee | ... | latitude | longitude | name | neighbourhood | number_of_reviews | review_scores_rating | thumbnail_url | zipcode | bedrooms | beds | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 74111.0 | 74111.0 | 74111 | 74111 | 74111 | 74111.0 | 73911.0 | 74111 | 74111 | 74111 | ... | 74111.0 | 74111.0 | 74111 | 67239 | 74111.0 | 57389.0 | 65895 | 73145 | 74020.0 | 73980.0 |
| unique | not-applicable | not-applicable | 35 | 3 | 67122 | not-applicable | not-applicable | 5 | 5 | 2 | ... | not-applicable | not-applicable | 73350 | 619 | not-applicable | not-applicable | 65883 | 669 | not-applicable | not-applicable |
| top | not-applicable | not-applicable | Apartment | Entire home/apt | {} | not-applicable | not-applicable | Real Bed | strict | True | ... | not-applicable | not-applicable | #NAME? | Williamsburg | not-applicable | not-applicable | https://a0.muscache.com/im/pictures/70087089/b... | 11211 | not-applicable | not-applicable |
| freq | not-applicable | not-applicable | 49003 | 41310 | 586 | not-applicable | not-applicable | 72028 | 32374 | 54403 | ... | not-applicable | not-applicable | 10 | 2862 | not-applicable | not-applicable | 3 | 1618 | not-applicable | not-applicable |
| mean | 11266617.102468 | 4.782069 | not-applicable | not-applicable | not-applicable | 3.155146 | 1.235263 | not-applicable | not-applicable | not-applicable | ... | 38.445958 | -92.397525 | not-applicable | not-applicable | 20.900568 | 94.067365 | not-applicable | not-applicable | 1.265793 | 1.710868 |
| std | 6081734.886894 | 0.717394 | not-applicable | not-applicable | not-applicable | 2.153589 | 0.582044 | not-applicable | not-applicable | not-applicable | ... | 3.080167 | 21.705322 | not-applicable | not-applicable | 37.828641 | 7.836556 | not-applicable | not-applicable | 0.852143 | 1.254142 |
| min | 344.0 | 0.0 | not-applicable | not-applicable | not-applicable | 1.0 | 0.0 | not-applicable | not-applicable | not-applicable | ... | 33.338905 | -122.5115 | not-applicable | not-applicable | 0.0 | 20.0 | not-applicable | not-applicable | 0.0 | 0.0 |
| 25% | 6261964.5 | 4.317488 | not-applicable | not-applicable | not-applicable | 2.0 | 1.0 | not-applicable | not-applicable | not-applicable | ... | 34.127908 | -118.342374 | not-applicable | not-applicable | 1.0 | 92.0 | not-applicable | not-applicable | 1.0 | 1.0 |
| 50% | 12254147.0 | 4.70953 | not-applicable | not-applicable | not-applicable | 2.0 | 1.0 | not-applicable | not-applicable | not-applicable | ... | 40.662138 | -76.996965 | not-applicable | not-applicable | 6.0 | 96.0 | not-applicable | not-applicable | 1.0 | 1.0 |
| 75% | 16402260.5 | 5.220356 | not-applicable | not-applicable | not-applicable | 4.0 | 1.0 | not-applicable | not-applicable | not-applicable | ... | 40.746096 | -73.95466 | not-applicable | not-applicable | 23.0 | 100.0 | not-applicable | not-applicable | 1.0 | 2.0 |
| max | 21230903.0 | 7.600402 | not-applicable | not-applicable | not-applicable | 16.0 | 8.0 | not-applicable | not-applicable | not-applicable | ... | 42.390437 | -70.985047 | not-applicable | not-applicable | 605.0 | 100.0 | not-applicable | not-applicable | 10.0 | 18.0 |
11 rows × 29 columns
# Per-column insight table: uniqueness, missingness, dtype and value samples.
insight_data = {
    info_cols[0]: df.nunique().to_list(),  # distinct values per column
    info_cols[1]: df.isnull().mean(),      # fraction of nulls per column
    info_cols[2]: list(df.dtypes),         # dtype per column
    info_cols[3]: list(df.count()),        # non-null count per column
    info_cols[4]: unique_values,           # the distinct values themselves
}
column_Insight_df = pd.DataFrame(insight_data, index=cols)
column_Insight_df
| num_of_unique_values | %_of_null_values | dtype_of_columns | count | unique | |
|---|---|---|---|---|---|
| id | 74111 | 0.000000 | int64 | 74111 | [6901257, 6304928, 7919400, 13418779, 3808709,... |
| log_price | 767 | 0.000000 | float64 | 74111 | [5.010635294, 5.129898715, 4.976733742, 6.6200... |
| property_type | 35 | 0.000000 | object | 74111 | [Apartment, House, Condominium, Loft, Townhous... |
| room_type | 3 | 0.000000 | object | 74111 | [Entire home/apt, Private room, Shared room] |
| amenities | 67122 | 0.000000 | object | 74111 | [{"Wireless Internet","Air conditioning",Kitch... |
| accommodates | 16 | 0.000000 | int64 | 74111 | [3, 7, 5, 4, 2, 6, 8, 1, 9, 10, 16, 11, 12, 14... |
| bathrooms | 17 | 0.002699 | float64 | 73911 | [1.0, 1.5, 2.0, nan, 2.5, 3.0, 0.5, 4.5, 5.0, ... |
| bed_type | 5 | 0.000000 | object | 74111 | [Real Bed, Futon, Pull-out Sofa, Couch, Airbed] |
| cancellation_policy | 5 | 0.000000 | object | 74111 | [strict, moderate, flexible, super_strict_30, ... |
| cleaning_fee | 2 | 0.000000 | bool | 74111 | [True, False] |
| city | 6 | 0.000000 | object | 74111 | [NYC, SF, DC, LA, Chicago, Boston] |
| description | 73474 | 0.000000 | object | 74111 | [Beautiful, sunlit brownstone 1-bedroom in the... |
| first_review | 2554 | 0.214057 | object | 58247 | [6/18/2016, 8/5/2017, 4/30/2017, nan, 5/12/201... |
| host_has_profile_pic | 2 | 0.002537 | object | 73923 | [t, nan, f] |
| host_identity_verified | 2 | 0.002537 | object | 73923 | [t, f, nan] |
| host_response_rate | 80 | 0.246913 | object | 55812 | [nan, 100%, 71%, 68%, 67%, 83%, 50%, 90%, 86%,... |
| host_since | 3087 | 0.002537 | object | 73923 | [3/26/2012, 6/19/2017, 10/25/2016, 4/19/2015, ... |
| instant_bookable | 2 | 0.000000 | object | 74111 | [f, t] |
| last_review | 1371 | 0.213558 | object | 58284 | [7/18/2016, 9/23/2017, 9/14/2017, nan, 1/22/20... |
| latitude | 74058 | 0.000000 | float64 | 74111 | [40.69652363, 40.76611542, 40.80810999, 37.772... |
| longitude | 73973 | 0.000000 | float64 | 74111 | [-73.99161685, -73.98903992, -73.94375584, -12... |
| name | 73350 | 0.000000 | object | 74111 | [Beautiful brownstone 1-bedroom, Superb 3BR Ap... |
| neighbourhood | 619 | 0.092726 | object | 67239 | [Brooklyn Heights, Hell's Kitchen, Harlem, Low... |
| number_of_reviews | 371 | 0.000000 | int64 | 74111 | [2, 6, 10, 0, 4, 3, 15, 9, 159, 82, 29, 13, 12... |
| review_scores_rating | 54 | 0.225635 | float64 | 57389 | [100.0, 93.0, 92.0, nan, 40.0, 97.0, 99.0, 90.... |
| thumbnail_url | 65883 | 0.110861 | object | 65895 | [https://a0.muscache.com/im/pictures/6d7cbbf7-... |
| zipcode | 669 | 0.013035 | object | 73145 | [11201, 10019, 10027, 94117, 20009, 94131, 902... |
| bedrooms | 11 | 0.001228 | float64 | 74020 | [1.0, 3.0, 2.0, 0.0, 4.0, nan, 5.0, 6.0, 7.0, ... |
| beds | 18 | 0.001768 | float64 | 73980 | [1.0, 3.0, 2.0, 7.0, 4.0, 6.0, 5.0, nan, 10.0,... |
# !pip install evidently
# import pandas as pd
# from sklearn import datasets
# from evidently.ui.workspace.cloud import CloudWorkspace
# from evidently.report import Report
# from evidently.metric_preset import DataQualityPreset, DataDriftPreset, TargetDriftPreset, RegressionPreset
# # Load the data for model monitoring from the provided CSV file
# monitoring_data = df.copy()
# # Rename the target column to 'target' for clarity
# monitoring_data.rename(columns={'log_price': 'target'}, inplace=True)
# # Create a CloudWorkspace instance with the provided token and URL
# workspace = CloudWorkspace(token="<EVIDENTLY_API_TOKEN>",  # SECURITY: real token redacted — never commit API tokens, even in commented-out code; load it from an environment variable instead
# url="https://app.evidently.cloud")
# # Create a new project within the workspace for Home Stays Model Development
# project = workspace.create_project("Home Stays Model Dev")
# # Add a description to the project
# project.description = "Developing the model for production"
# # Save the changes to the project
# project.save()
# # Splitting the training data into reference data and current data
# reference_data = monitoring_data[:int(len(monitoring_data) * 0.8)] # 80% of the training data for reference
# current_data = monitoring_data[int(len(monitoring_data) * 0.8):] # Remaining 20% of the training data for current analysis
# # Create a data report object with data quality metrics
# data_report = Report(
# metrics=[
# DataQualityPreset(),
# ]
# )
# # Run the data quality metrics on the reference and current data
# data_report.run(reference_data=reference_data, current_data=current_data)
# # Add the data report to the project in the workspace
# workspace.add_report(project_id=project.id, report=data_report)
# data_report
# data_drift_report = Report(
# metrics=[
# DataDriftPreset(),
# ],
# )
# data_drift_report.run(reference_data=reference_data, current_data=current_data)
# workspace.add_report(project.id, data_drift_report)
# data_drift_report
# num_target_drift_report = Report(metrics=[
# TargetDriftPreset(),
# ])
# num_target_drift_report.run(reference_data=reference_data, current_data=current_data)
# workspace.add_report(project.id, num_target_drift_report)
# num_target_drift_report
# from evidently.metric_preset import RegressionPreset
# reg_performance_report = Report(metrics=[
# RegressionPreset(),
# ])
# reg_performance_report.run(reference_data=reference_data, current_data=current_data)
# workspace.add_report(project.id, reg_performance_report)
# reg_performance_report
Cleaning the data to ensure its accuracy and usability:¶
1. Identifying and handling missing values (deletion, imputation).
2. Correcting errors and inconsistencies.
3. Standardizing data formats (dates, currencies, etc.).
4. Removing irrelevant or duplicate data points.
# Method 1: Frequency Table and Visualization
def check_outliers_freq_table(data, col, outlier_threshold=0.1):
    """
    Print a frequency table for a categorical column, plot its distribution,
    and report values whose relative frequency falls below a threshold.

    Parameters:
    - data (DataFrame): DataFrame containing the column to analyse.
    - col (str): Name of the categorical column.
    - outlier_threshold (float, optional): Relative-frequency cutoff below
      which a category is flagged as a potential outlier. Default 0.1.

    Returns:
    - None (results are printed and plotted)
    """
    # Build the frequency table explicitly so the value column is always
    # named after `col`, regardless of the pandas version.
    # BUG FIX: `value_counts().reset_index(name=...)` names the value column
    # 'index' on older pandas, which breaks the plot(x=col) call below.
    frequency_table = (
        data[col]
        .value_counts()
        .rename_axis(col)
        .reset_index(name='Frequency')
    )
    frequency_table['Relative Frequency'] = frequency_table['Frequency'] / len(data)

    # Print frequency table
    print("Frequency Table:")
    print(frequency_table.to_string(index=True))

    # Visualize distribution
    frequency_table.plot(kind='bar', x=col, y='Frequency', title='Frequency Distribution')
    plt.show()

    # Rare categories are flagged as potential outliers
    outliers = frequency_table[frequency_table['Relative Frequency'] < outlier_threshold][col].tolist()
    print("\nPotential outliers based on frequency (less than", outlier_threshold, "relative frequency):")
    print(outliers)
# Method 2: IQR (for ordinal data)
def check_outliers_iqr(data):
    """
    Flag potential outliers in an ordinal (ordered categorical) Series
    using the 1.5*IQR rule.

    Parameters:
    - data (Series): The column to check.

    Returns:
    - None (results are printed; the function exits early when the data
      is not categorical or the categories have no apparent order).
    """
    # BUG FIX: `pd.api.types.is_categorical_dtype` is deprecated in modern
    # pandas — inspect the dtype directly instead.
    if not isinstance(data.dtype, pd.CategoricalDtype):
        print("IQR method is only applicable for ordinal data (categories with natural order).")
        return

    # Heuristic order check: unique() preserves category order, so any
    # inversion between adjacent values suggests no usable ordering.
    categories = data.unique()
    if not all(categories[i] <= categories[i + 1] for i in range(len(categories) - 1)):
        print("Categories in your data do not seem to have a natural order. IQR method might not be suitable.")
        return

    # Quartiles and the inter-quartile range
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1

    # 1.5*IQR fences
    upper_bound = Q3 + (1.5 * IQR)
    lower_bound = Q1 - (1.5 * IQR)

    # Anything outside the fences is a potential outlier
    outliers = data[(data > upper_bound) | (data < lower_bound)].tolist()
    print("\nPotential outliers based on IQR rule:")
    print(outliers)
def plot_kde(data, column, shade=True, color="b", bw_method=0.5, xlabel=None, ylabel=None):
    """
    Draw a kernel-density-estimate (KDE) plot for one DataFrame column.

    Parameters:
    - data (DataFrame): Source of the values to plot.
    - column (str): Column to plot.
    - shade (bool, optional): Fill the area under the curve. Default True.
    - color (str, optional): Curve color. Default "b" (blue).
    - bw_method (float, optional): KDE bandwidth. Default 0.5.
    - xlabel (str, optional): X-axis label; falls back to the column name.
    - ylabel (str, optional): Y-axis label; omitted when None.

    Returns:
    - None
    """
    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 8))

    # Draw the density curve itself
    sns.kdeplot(data[column], shade=shade, color=color, bw_method=bw_method)

    # Labels and title (column name doubles as the default x label)
    plt.xlabel(xlabel if xlabel else column)
    if ylabel:
        plt.ylabel(ylabel)
    plt.title(f"{column} KDE Plot")

    plt.show()
def plot_kde_before_after_imputation(original_data, imputed_data, column, bw_method=0.5):
    """
    Overlay the KDE of a column before and after imputation.

    Parameters:
    - original_data (DataFrame): Data before imputation (`column` is selected from it).
    - imputed_data (Series or array): The imputed values after imputation.
    - column (str): Column name being compared.
    - bw_method (float, optional): KDE bandwidth. Default 0.5.

    Returns:
    - None
    """
    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 6))

    # Pre-imputation distribution (solid blue) vs post-imputation (dashed green)
    sns.kdeplot(original_data[column], label='Before Imputation', shade=True, color="b", bw_method=bw_method)
    sns.kdeplot(imputed_data, label='After Imputation', linestyle='--', shade=True, color="g", bw_method=bw_method)

    # Titles, axis labels and legend
    plt.title(f'Distribution of {column} (Before and After Imputation)')
    plt.xlabel(column)
    plt.ylabel('Density')
    plt.legend(title='Data')

    plt.show()
def create_seaborn_countplot(data, x, palette='viridis', title='Count Plot'):
    """
    Draw a Seaborn count plot of one categorical column.

    Parameters:
    - data (DataFrame): Source of the data.
    - x (str): Name of the categorical column.
    - palette (str or list of colors, optional): Color palette. Default 'viridis'.
    - title (str, optional): Plot title. Default 'Count Plot'.

    Returns:
    - None
    """
    # Large canvas so long category labels stay readable
    plt.figure(figsize=(33, 16))
    sns.countplot(x=x, data=data, palette=palette)
    plt.title(title)
    plt.show()
import plotly.express as px
def create_plotly_countplot(data, x, color='green', title='Count Plot'):
    """
    Draw an interactive count plot with Plotly Express.

    Parameters:
    - data (DataFrame): Source of the data.
    - x (str): Name of the categorical column.
    - color (str, optional): Bar color. Default 'green'.
    - title (str, optional): Plot title. Default 'Count Plot'.

    Returns:
    - None
    """
    # A histogram over a categorical axis is Plotly's count plot;
    # text_auto=True prints the count on top of each bar.
    figure = px.histogram(
        data,
        x=x,
        color_discrete_sequence=[color],
        text_auto=True,
    )
    figure.update_layout(title=title)
    figure.show()
def plot_barplot(df, xcol, ycol, title=None):
    """
    Create a barplot of `ycol` against `xcol`.

    Parameters:
    df (DataFrame): The input DataFrame.
    xcol (str): The name of the column to use for the x-axis.
    ycol (str): The name of the column to use for the y-axis.
    title (str, optional): The title of the plot. Defaults to 'Barplot of {xcol}'.

    Returns:
    matplotlib.axes.Axes: The created barplot axes.
    """
    # Set Seaborn style
    sns.set_style("whitegrid")

    # Create barplot
    plt.figure(figsize=(12, 8))  # Adjust size as needed
    ax = sns.barplot(x=df[xcol], y=df[ycol])

    # Add labels and title
    ax.set_xlabel(xcol)
    # BUG FIX: the y-axis label was hard-coded to 'Count' even though the
    # axis actually shows df[ycol] values.
    ax.set_ylabel(ycol)
    ax.set_title(title if title else f'Barplot of {xcol}')

    # Rotate x-axis labels; tick_params avoids the FixedLocator warning
    # raised by set_xticklabels(ax.get_xticklabels(), ...).
    ax.tick_params(axis='x', rotation=45)

    # Show plot
    plt.show()
    return ax
def plot_correlation_heatmap(df, columns):
    """
    Plot a heatmap showing the correlation between specified columns in a DataFrame.

    NOTE(review): this function is redefined later in this file with a larger
    figure size (33x16); that later definition shadows this one, so this
    version is effectively dead code.

    Parameters:
    df (DataFrame): The input DataFrame.
    columns (list): A list of column names to include in the heatmap.
    Returns:
    None
    """
    # Set Seaborn style
    sns.set_style("whitegrid")
    # Calculate correlation matrix
    corr_matrix = df[columns].corr()
    # Plot heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    # Add title
    plt.title('Correlation Heatmap')
    # Show plot
    plt.show()
def calculate_mode(df, groupby_column, aggregate_columns):
    """
    Calculate the mode of each specified column within each group.

    Parameters:
    df (DataFrame): The input DataFrame.
    groupby_column (str): The name of the column to group by.
    aggregate_columns (list): A list of column names to calculate the mode for.

    Returns:
    DataFrame: One row per group and one 'mode_<column>' column per
    requested column. Groups whose values are all missing get NaN.
    """
    def _first_mode(x):
        # BUG FIX: x.mode() is empty when a group contains only NaN values,
        # so .iloc[0] would raise IndexError. Fall back to NaN instead.
        modes = x.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    mode_data = {}
    for column in aggregate_columns:
        mode_data[f'mode_{column}'] = df.groupby(groupby_column)[column].agg(_first_mode)
    return pd.DataFrame(mode_data)
# Redundant re-imports: seaborn and matplotlib are already imported at the top of the file.
import seaborn as sns
import matplotlib.pyplot as plt
def plot_correlation_heatmap(df, columns):
    """
    Plot a heatmap showing the correlation between specified columns in a DataFrame.

    NOTE(review): this redefinition overrides the earlier plot_correlation_heatmap
    defined above; the only difference is the larger figure size (33x16 vs 10x8).

    Parameters:
    df (DataFrame): The input DataFrame.
    columns (list): A list of column names to include in the heatmap.
    Returns:
    None
    """
    # Set Seaborn style
    sns.set_style("whitegrid")
    # Calculate correlation matrix
    corr_matrix = df[columns].corr()
    # Plot heatmap
    plt.figure(figsize=(33, 16))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    # Add title
    plt.title('Correlation Heatmap')
    # Show plot
    plt.show()
def plot_lineplot(df, x_column, y_column):
    """
    Draw a line plot of `y_column` against `x_column`.

    Parameters:
    df (DataFrame): The input DataFrame.
    x_column (str): The name of the column for the x-axis.
    y_column (str): The name of the column for the y-axis.

    Returns:
    None
    """
    sns.set_style("whitegrid")

    # Large canvas to keep dense series readable
    plt.figure(figsize=(33, 16))
    sns.lineplot(data=df, x=x_column, y=y_column)

    # Axis labels and title derived from the column names
    plt.xlabel(x_column)
    plt.ylabel(y_column)
    plt.title(f'Line Plot of {y_column} vs {x_column}')

    plt.show()
def analyze_skewness(data, column):
    """
    Print the skewness of a DataFrame column together with a verbal
    interpretation of the distribution's shape.

    Parameters:
    - data (DataFrame): The DataFrame containing the data.
    - column (str): The column whose skewness is computed and interpreted.

    Returns:
    - None
    """
    skewness = data[column].skew()
    print("Skewness of the dataset:", skewness)

    # Map the skewness value onto the conventional interpretation bands.
    # Each elif already knows the previous bound failed, so only the upper
    # bound of each band needs to be tested.
    if skewness < -1:
        verdict = "The distribution is highly left-skewed."
    elif skewness < -0.5:
        verdict = "The distribution is moderately left-skewed."
    elif skewness < 0.5:
        verdict = "The distribution is approximately symmetric."
    elif skewness < 1:
        verdict = "The distribution is moderately right-skewed."
    else:
        verdict = "The distribution is highly right-skewed."
    print(verdict)
def random_imputation(df, column, random_state=None):
    """
    Fill missing values in a column by sampling (with replacement) from the
    observed values of the same column.

    Parameters:
    - df (DataFrame): The input DataFrame (left unmodified).
    - column (str): The column name containing missing values to be imputed.
    - random_state (int, optional): Seed for reproducible sampling.
      Default None (non-deterministic, matching the original behavior).

    Returns:
    - df_imputed (DataFrame): A copy of df with the missing values imputed.

    Raises:
    - ValueError: If the column has missing values but no observed values
      to sample from.
    """
    # Boolean mask of the cells to fill
    missing_mask = df[column].isnull()

    # Nothing to impute — return a copy, as the original code effectively did.
    if not missing_mask.any():
        return df.copy()

    observed = df.loc[~missing_mask, column]
    if observed.empty:
        # Explicit error instead of the opaque ValueError pandas would raise.
        raise ValueError(f"Column '{column}' has no observed values to sample from.")

    # One replacement value drawn per missing cell
    sampled = observed.sample(n=missing_mask.sum(), replace=True, random_state=random_state)

    df_imputed = df.copy()
    # .values strips the sampled index so assignment follows the mask positions.
    df_imputed.loc[missing_mask, column] = sampled.values
    return df_imputed
def find_nearest_zipcode(latitude, longitude):
    """
    Reverse-geocode a coordinate pair to the zip code of the nearest address.

    NOTE: performs a network call to the public Nominatim service and is
    subject to its availability and rate limits.

    Parameters:
    - latitude (float): Latitude coordinate.
    - longitude (float): Longitude coordinate.

    Returns:
    - nearest_zipcode (str or None): Zip code of the nearest location, or
      None when the service returns no match or the match has no postcode.
    """
    # Create geolocator object
    geolocator = Nominatim(user_agent="nearest_zipcode_finder")

    # Get the location information based on latitude and longitude
    location = geolocator.reverse((latitude, longitude), exactly_one=True)

    # BUG FIX: reverse() returns None when nothing is found; the original
    # code would raise AttributeError on location.raw in that case.
    if location is None:
        return None

    # Extract zip code from location information (may itself be absent)
    return location.raw['address'].get('postcode')
def knn_impute(df, column_name, n_neighbors=5):
    """
    Impute missing values in one column with sklearn's KNNImputer.

    NOTE(review): only the single column is given to the imputer, so there
    are no neighboring features to learn from — this effectively becomes a
    mean imputation. The result is rounded and cast to int, so fractional
    values already present in the column (e.g. 1.5 bathrooms) are rounded too.

    Parameters:
    df (DataFrame): The input DataFrame (modified in place and returned).
    column_name (str): The name of the column to impute.
    n_neighbors (int): The number of neighbors to consider for imputation.

    Returns:
    DataFrame: The same DataFrame with the specified column imputed.
    """
    # Single-column copy so the imputer sees only this feature
    df_missing = df[[column_name]].copy()

    # Fit and apply the imputer
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputed_data = imputer.fit_transform(df_missing)

    # Back to a DataFrame, then round to whole numbers
    imputed_df = pd.DataFrame(imputed_data, columns=[column_name])
    imputed_df[column_name] = imputed_df[column_name].round().astype(int)

    # BUG FIX: assign by position, not by label. `imputed_df` has a fresh
    # RangeIndex, so label-aligned assignment would misalign (and introduce
    # NaNs) whenever `df` does not use the default 0..n-1 index.
    df[column_name] = imputed_df[column_name].to_numpy()
    return df
# Method 1: Using isnull().any()
# Columns that contain at least one missing value.
null_columns_1 = df.columns[df.isnull().any()]
print(null_columns_1)
Index(['first_review', 'host_has_profile_pic', 'host_identity_verified',
'host_response_rate', 'host_since', 'last_review', 'neighbourhood',
'review_scores_rating', 'thumbnail_url', 'zipcode', 'bedrooms', 'beds'],
dtype='object')
# Per-column count of missing values, to prioritize the imputation work below.
df.isnull().sum()
id 0 log_price 0 property_type 0 room_type 0 amenities 0 accommodates 0 bathrooms 0 bed_type 0 cancellation_policy 0 cleaning_fee 0 city 0 description 0 first_review 15864 host_has_profile_pic 188 host_identity_verified 188 host_response_rate 18299 host_since 188 instant_bookable 0 last_review 15827 latitude 0 longitude 0 name 0 neighbourhood 6872 number_of_reviews 0 review_scores_rating 16722 thumbnail_url 8216 zipcode 966 bedrooms 91 beds 131 amenities_count 0 has_wireless_internet 0 has_kitchen 0 has_heating 0 has_essentials 0 has_smoke_detector 0 dtype: int64
Note
In real estate or housing datasets, fractional values in the number of bathrooms can represent scenarios like a bathroom with both a bathtub and a shower (counted as 1.5 bathrooms) or a bathroom with additional features like a jacuzzi (counted as 2.5 bathrooms).
To handle these fractional values, you can either (1) round them to whole numbers or (2) keep them as they are.
So I am going with method 2 (i.e., keep them as is).
# Distribution and skewness of 'bathrooms' before imputation.
plot_kde(df, column='bathrooms', shade=True, color="b", bw_method=0.5, ylabel="Density")
analyze_skewness(df, column='bathrooms')
Skewness of the dataset: 3.475813822494678 The distribution is highly right-skewed.
# Impute missing 'bathrooms' values (note: knn_impute rounds the column to int).
df = knn_impute(df, 'bathrooms', n_neighbors=5)
df['bathrooms']
0 1
1 1
2 1
3 1
4 1
..
74106 1
74107 2
74108 1
74109 1
74110 1
Name: bathrooms, Length: 74111, dtype: int32
# check_outliers_freq_table(df,'bathrooms')
# check_outliers_iqr(df['bathrooms'])
# NOTE: df[cols].fillna('m', inplace=True) did not work here because the
# column selection returns a temporary copy, so the in-place fill never
# reaches df itself. Assigning the result back avoids that pitfall.
# 'm' is a temporary placeholder for a missing value.
host_cols = ['host_has_profile_pic', 'host_identity_verified']
df[host_cols] = df[host_cols].replace(np.nan, 'm')
# Recode the flags: 't' -> True, 'f' -> False, 'm' -> "missing"
df[host_cols] = df[host_cols].replace({'t': True, 'f': False, 'm': "missing"})
# Frequency of the host profile-picture / identity-verification flags.
create_seaborn_countplot(df, 'host_has_profile_pic')
create_seaborn_countplot(df, 'host_identity_verified')
# Distribution and skewness of 'beds' before imputation.
plot_kde(df, column='beds', shade=True, color="b", bw_method=0.5, ylabel="Density")
analyze_skewness(df, column='beds')
Skewness of the dataset: 3.358000200663669 The distribution is highly right-skewed.
# KNN-impute the 131 missing 'beds' values.
df = knn_impute(df, 'beds', n_neighbors=5)
# check_outliers_freq_table(df,'beds')
# check_outliers_iqr(df['beds'])
# Distribution and skew of bedroom counts before imputation.
plot_kde(df, column='bedrooms', shade=True, color="b", bw_method=0.5, ylabel="Density")
analyze_skewness(df, column='bedrooms')
Skewness of the dataset: 1.989848743689331 The distribution is highly right-skewed.
# KNN-impute the 91 missing 'bedrooms' values.
df = knn_impute(df, 'bedrooms', n_neighbors=5)
# check_outliers_freq_table(df,'bedrooms')
# check_outliers_iqr(df['bedrooms'])
# Frequency of each review score (ratings cluster near 100).
df['review_scores_rating'].value_counts()
100.0 16215 98.0 4374 97.0 4087 96.0 4081 95.0 3713 93.0 3647 90.0 2852 99.0 2631 94.0 2618 80.0 2163 92.0 2064 91.0 1615 89.0 1120 87.0 1119 88.0 1056 85.0 625 86.0 512 60.0 444 84.0 438 83.0 403 82.0 211 70.0 196 73.0 157 81.0 126 75.0 101 20.0 97 78.0 94 40.0 90 79.0 83 76.0 76 77.0 74 67.0 66 74.0 39 72.0 38 50.0 30 65.0 28 68.0 20 71.0 14 69.0 13 63.0 11 53.0 10 64.0 10 47.0 5 30.0 4 62.0 3 66.0 3 55.0 3 57.0 3 27.0 2 35.0 1 49.0 1 58.0 1 54.0 1 56.0 1 Name: review_scores_rating, dtype: int64
# Distribution and skew of review scores (left-skewed: most are near 100).
plot_kde(df, column='review_scores_rating', shade=True, color="b", bw_method=0.5, ylabel="Density")
analyze_skewness(df, column='review_scores_rating')
Skewness of the dataset: -3.3808606233341605 The distribution is highly left-skewed.
# KNN-impute the ~16.7k missing review scores.
df = knn_impute(df, 'review_scores_rating', n_neighbors=5)
# df = df.drop('thumbnail_url', axis=1)
Dropping this column because it has no significance in the dataset.
# Strip the trailing '%' so host_response_rate becomes numeric-looking text.
df['host_response_rate'] = df['host_response_rate'].str.split("%").str[0]
# Fill the ~18.3k missing rates by sampling from observed values; done before
# the cast so astype never sees NaN.
df = random_imputation(df, column='host_response_rate')
df['host_response_rate'] = df['host_response_rate'].astype('int')
Since the data is continuous, random imputation is the better choice because it introduces less bias.
df[['last_review','host_since','first_review']]
| last_review | host_since | first_review | |
|---|---|---|---|
| 0 | 7/18/2016 | 3/26/2012 | 6/18/2016 |
| 1 | 9/23/2017 | 6/19/2017 | 8/5/2017 |
| 2 | 9/14/2017 | 10/25/2016 | 4/30/2017 |
| 3 | NaN | 4/19/2015 | NaN |
| 4 | 1/22/2017 | 3/1/2015 | 5/12/2015 |
| ... | ... | ... | ... |
| 74106 | NaN | 3/24/2013 | NaN |
| 74107 | 4/15/2017 | 5/3/2016 | 8/15/2016 |
| 74108 | 9/10/2017 | 1/5/2012 | 1/3/2015 |
| 74109 | NaN | 9/17/2017 | NaN |
| 74110 | 4/30/2017 | 11/26/2012 | 9/5/2013 |
74111 rows × 3 columns
# Convert multiple columns to datetime format
# (dates are M/D/YYYY strings; pd.to_datetime infers the format).
df[['last_review', 'host_since', 'first_review']] = df[['last_review', 'host_since', 'first_review']].apply(pd.to_datetime)
# Remaining missing dates after conversion.
df[['last_review', 'host_since', 'first_review']].isnull().sum()
last_review 15827 host_since 188 first_review 15864 dtype: int64
# Forward-fill missing datetime values: each gap takes the last observed date
# in the same column.  DataFrame.ffill() replaces the fillna(method='ffill')
# form, which is deprecated in modern pandas.
df['last_review'] = df['last_review'].ffill()
df['host_since'] = df['host_since'].ffill()
df['first_review'] = df['first_review'].ffill()
Since all of these columns are datetimes,
I initially used mode imputation, but I don't think it is suitable for this task, so
I used forward fill instead (each missing value is replaced with the last observed non-null value in the same column).
first_review and last_review: Extract features such as review duration or time since last review.
host_since: Calculate the tenure of the host.
# --- Date-derived features ---------------------------------------------------
# Snapshot "now" once so both elapsed-time features share the same reference
# instant (the original called pd.to_datetime('today') separately for each,
# which could straddle a day boundary).
_now = pd.to_datetime('today')
# Days between a listing's first and last review.
df['review_duration'] = (df['last_review'] - df['first_review']).dt.days
# Days since the most recent review was posted.
df['time_since_last_review'] = (_now - df['last_review']).dt.days
# Host tenure: days since the host joined the platform.
df['host_tenure'] = (_now - df['host_since']).dt.days
number_of_reviews and review_scores_rating: Calculate a new feature representing the average review score.
# Calculate the average review score for each number_of_reviews group.
# NOTE(review): this aggregates ratings keyed only on review count, so every
# listing with the same number_of_reviews gets the same value — confirm this
# is the intended "average review score" feature.
average_review_score = df.groupby('number_of_reviews')['review_scores_rating'].mean().reset_index()
average_review_score.rename(columns={'review_scores_rating': 'average_review_score'}, inplace=True)
# Merge the average_review_score back to the original DataFrame
df = pd.merge(df, average_review_score, on='number_of_reviews', how='left')
Extract information from the amenities list, such as the count of amenities or specific amenities present.
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Function to extract amenities from JSON string
def extract_amenities(amenities_str):
    """Parse a brace-wrapped amenities string into a list of amenity names."""
    # Drop the surrounding braces and all double quotes, then split on commas.
    cleaned = amenities_str.strip('{}').replace('"', '')
    return cleaned.split(',')
# Extract amenities from all rows and collect unique amenities into a set
all_amenities = set()
for amenities_str in df['amenities']:
    amenities_list = extract_amenities(amenities_str)
    all_amenities.update(amenities_list)
# Create a string with all amenities separated by spaces (each unique amenity
# contributes once, so the cloud shows variety rather than frequency).
amenities_text = ' '.join(all_amenities)
# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color ='white').generate(amenities_text)
# Display the generated word cloud
plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
def count_amenities(amenities_str):
    """
    Count the number of amenities in a brace-wrapped amenities string.

    Parameters:
        amenities_str (str): String such as '{"Wireless Internet",Kitchen}'.

    Returns:
        int: Number of amenities; 0 for an empty set ('{}').  The original
        ''.split(',') -> [''] logic counted an empty amenities string as 1.
    """
    # Remove braces and quotes, then split by comma to get individual amenities.
    cleaned = amenities_str.strip('{}').replace('"', '')
    if not cleaned:
        return 0
    return len(cleaned.split(','))
# Count the number of amenities for each listing.
df['amenities_count'] = df['amenities'].apply(count_amenities)
def extract_amenities(amenities_str):
    """
    Extract amenities from a brace-wrapped string.

    Parameters:
        amenities_str (str): String such as '{"Wireless Internet",Kitchen}'.

    Returns:
        list: Amenity names in their original order.
    """
    # Strip the braces first, then the quotes, then split the remainder.
    without_braces = amenities_str.strip('{}')
    without_quotes = without_braces.replace('"', '')
    return without_quotes.split(',')
# Extract all amenities and create a list of all amenities across all listings
# (extend, not update: duplicates are kept so frequencies can be counted).
all_amenities = []
for amenities_str in df['amenities']:
    all_amenities.extend(extract_amenities(amenities_str))
# Calculate the frequency of each amenity
amenities_frequency = pd.Series(all_amenities).value_counts()
# Select the top 5 most common amenities
top_5_amenities = amenities_frequency.head(5).index.tolist()
print("Top 5 most common amenities:")
print(top_5_amenities)
# Use the top-5 most frequent amenities as the "important" set below.
important_amenities = top_5_amenities
def check_important_amenity(amenities_str, important_amenity):
    """
    Check whether an amenity is present in a listing's amenities string.

    Parameters:
        amenities_str (str): Brace-wrapped, comma-separated amenities string.
        important_amenity (str): The amenity to check for.

    Returns:
        int: 1 if the amenity is present, 0 otherwise.

    The string is parsed and compared on whole amenity names.  The original
    plain substring test could false-positive (e.g. 'Internet' would match a
    listing that only has 'Wireless Internet').
    """
    amenities = amenities_str.strip('{}').replace('"', '').split(',')
    return 1 if important_amenity in amenities else 0
# Create binary has_* columns for each important amenity
# (e.g. 'Wireless Internet' -> has_wireless_internet).
prefix = 'has_'
for amenity in important_amenities:
    column_name = prefix + amenity.lower().replace(' ', '_')
    # apply() runs inside the loop, so the lambda sees the current amenity.
    df[column_name] = df['amenities'].apply(lambda x: check_important_amenity(x, amenity))
Top 5 most common amenities: ['Wireless Internet', 'Kitchen', 'Heating', 'Essentials', 'Smoke detector']
# Frequency of each property type.
create_seaborn_countplot(df, 'property_type')
- As we can see, Apartment, House, and Condominium are the top 3 property types; the obvious reason is that most city properties are apartments.
# Frequency of each room type.
create_seaborn_countplot(df, 'room_type')
- family: choose entire home/apt
- couples/singles: choose private room
- friends : shared rooms
# Frequency of each bed type.
create_seaborn_countplot(df, 'bed_type')
We can see that almost 97% of rooms/homes are equipped with real beds.
# Frequency of each cancellation policy level.
create_seaborn_countplot(df, 'cancellation_policy')
We can observe 5 different levels of cancellation policy; among them,
strict, moderate, and flexible are the most common.
# Listings per city.
create_seaborn_countplot(df, 'city')
New York City has the most apartments/homes, followed by Los Angeles.
San Francisco and DC have roughly equal shares,
and Chicago and Boston are also similar in number.
# Keep only the leading zipcode token: drop ZIP+4 suffixes after '-' and the
# literal '\r' carriage-return artifacts some rows contain (e.g.
# '55555-57484\r...\r94158' -> '55555').  The original chained a no-op
# `.str[:]` between the two splits; removed.
df['zipcode'] = df['zipcode'].str.split("-").str[0].str.split("\r").str[0]
create_seaborn_countplot(df, 'zipcode')
We can observe a wide range of irregular patterns in the zipcode count plot.
# Listings per neighbourhood, then distribution of the profile-pic flag.
create_seaborn_countplot(df, 'neighbourhood')
create_seaborn_countplot(df, 'host_has_profile_pic')
Almost all of them have a profile picture.
# Distribution of host identity verification.
create_seaborn_countplot(df, 'host_identity_verified')
Most of the hosts are verified (49.0k) while some of them are not verified (24k).
# Distribution of the instant-bookable flag.
create_seaborn_countplot(df, 'instant_bookable')
Nearly 1/3 of them do not offer instant booking.
# Summary statistics for all non-object (numeric/datetime-derived) columns.
df.select_dtypes(exclude='object').describe()
| id | log_price | accommodates | bathrooms | host_response_rate | latitude | longitude | number_of_reviews | review_scores_rating | bedrooms | ... | amenities_count | has_wireless_internet | has_kitchen | has_heating | has_essentials | has_smoke_detector | review_duration | time_since_last_review | host_tenure | average_review_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7.411100e+04 | 74111.000000 | 74111.000000 | 74111.000000 | 74111.000000 | 74111.000000 | 74111.000000 | 74111.000000 | 74111.000000 | 74111.000000 | ... | 74111.000000 | 74111.000000 | 74111.000000 | 74111.000000 | 74111.000000 | 74111.000000 | 74111.000000 | 74111.000000 | 74111.000000 | 74111.000000 |
| mean | 1.126662e+07 | 4.782069 | 3.155146 | 1.250624 | 94.326011 | 38.445958 | -92.397525 | 20.900568 | 94.052165 | 1.265467 | ... | 17.602407 | 0.961598 | 0.911147 | 0.905034 | 0.863637 | 0.832899 | 425.806129 | 2610.258383 | 3577.994090 | 94.052165 |
| std | 6.081735e+06 | 0.717394 | 2.153589 | 0.597852 | 16.387421 | 3.080167 | 21.705322 | 37.828641 | 6.896060 | 0.851671 | ... | 6.936947 | 0.192166 | 0.284534 | 0.293169 | 0.343176 | 0.373068 | 490.276735 | 231.118332 | 661.467516 | 0.414158 |
| min | 3.440000e+02 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 33.338905 | -122.511500 | 0.000000 | 20.000000 | 0.000000 | ... | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -1337.000000 | 2406.000000 | 2407.000000 | 86.000000 |
| 25% | 6.261964e+06 | 4.317488 | 2.000000 | 1.000000 | 100.000000 | 34.127908 | -118.342374 | 1.000000 | 93.000000 | 1.000000 | ... | 13.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 52.000000 | 2433.000000 | 3059.000000 | 93.814894 |
| 50% | 1.225415e+07 | 4.709530 | 2.000000 | 1.000000 | 100.000000 | 40.662138 | -76.996965 | 6.000000 | 94.000000 | 1.000000 | ... | 17.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 264.000000 | 2566.000000 | 3509.000000 | 94.000000 |
| 75% | 1.640226e+07 | 5.220356 | 4.000000 | 1.000000 | 100.000000 | 40.746096 | -73.954660 | 23.000000 | 99.000000 | 1.000000 | ... | 22.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 622.500000 | 2675.000000 | 4035.000000 | 94.357104 |
| max | 2.123090e+07 | 7.600402 | 16.000000 | 8.000000 | 100.000000 | 42.390437 | -70.985047 | 605.000000 | 100.000000 | 10.000000 | ... | 86.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 3106.000000 | 5585.000000 | 5909.000000 | 100.000000 |
8 rows × 21 columns
# Univariate views of the numeric and engineered columns: countplots for
# discrete features, KDEs for continuous ones.
create_seaborn_countplot(df, 'accommodates')
create_seaborn_countplot(df, 'bathrooms')
create_seaborn_countplot(df, 'cleaning_fee')
plot_kde(df, 'host_response_rate')
plot_kde(df, 'number_of_reviews')
plot_kde(df, 'review_scores_rating')
create_seaborn_countplot(df, 'bedrooms')
create_seaborn_countplot(df, 'beds')
plot_kde(df, 'time_since_last_review')
plot_kde(df, 'host_tenure')
plot_kde(df, 'average_review_score')
create_seaborn_countplot(df,'amenities_count' )
create_seaborn_countplot(df, 'has_wireless_internet')
create_seaborn_countplot(df, 'has_kitchen')
create_seaborn_countplot(df, 'has_essentials')
create_seaborn_countplot(df, 'has_heating')
create_seaborn_countplot(df, 'has_smoke_detector')
# Mean log price per property type.  Pass the column name to groupby directly
# (the original df.groupby(df['property_type']) form is an unnecessary
# round-trip through the Series; behavior is identical).
mean_log_price_wrt_pt = df.groupby('property_type')['log_price'].mean().to_frame().reset_index()
plot_barplot(mean_log_price_wrt_pt, 'property_type', 'log_price')
<AxesSubplot: title={'center': 'Barplot of property_type'}, xlabel='property_type', ylabel='Count'>
# Calculating the mean log price for each room type
mean_log_price_wrt_rt = df.groupby(df['room_type'])['log_price'].mean().to_frame().reset_index() # dataframe of room_type & mean log-price
plot_barplot(mean_log_price_wrt_rt, 'room_type', 'log_price')
<AxesSubplot: title={'center': 'Barplot of room_type'}, xlabel='room_type', ylabel='Count'>
# Mean log price by how many guests the listing accommodates.
mean_log_price_wrt_acc = df.groupby(df['accommodates'])['log_price'].mean().to_frame().reset_index() # return the dataframe of accommodates & mean log-price
plot_barplot(mean_log_price_wrt_acc, 'accommodates', 'log_price')
<AxesSubplot: title={'center': 'Barplot of accommodates'}, xlabel='accommodates', ylabel='Count'>
# Mean log price by bathroom count.
mean_log_price_wrt_br = df.groupby(df['bathrooms'])['log_price'].mean().to_frame().reset_index() # return the dataframe of bathrooms & mean log-price
plot_barplot(mean_log_price_wrt_br, 'bathrooms', 'log_price')
<AxesSubplot: title={'center': 'Barplot of bathrooms'}, xlabel='bathrooms', ylabel='Count'>
# Mean log price by bed type (copy-pasted comment said "bathrooms"; fixed).
mean_log_price_wrt_bt = df.groupby(df['bed_type'])['log_price'].mean().to_frame().reset_index() # dataframe of bed_type & mean log-price
plot_barplot(mean_log_price_wrt_bt, 'bed_type', 'log_price')
<AxesSubplot: title={'center': 'Barplot of bed_type'}, xlabel='bed_type', ylabel='Count'>
# Mean log price by cancellation policy.
mean_log_price_wrt_cp = df.groupby(df['cancellation_policy'])['log_price'].mean().to_frame().reset_index() # dataframe of cancellation_policy & mean log-price
plot_barplot(mean_log_price_wrt_cp, 'cancellation_policy', 'log_price')
<AxesSubplot: title={'center': 'Barplot of cancellation_policy'}, xlabel='cancellation_policy', ylabel='Count'>
# Mean log price by whether a cleaning fee is charged.
mean_log_price_wrt_cf = df.groupby(df['cleaning_fee'])['log_price'].mean().to_frame().reset_index() # dataframe of cleaning_fee & mean log-price
plot_barplot(mean_log_price_wrt_cf, 'cleaning_fee', 'log_price')
<AxesSubplot: title={'center': 'Barplot of cleaning_fee'}, xlabel='cleaning_fee', ylabel='Count'>
# Mean log price by instant-bookable flag.
mean_log_price_wrt_ib = df.groupby(df['instant_bookable'])['log_price'].mean().to_frame().reset_index() # dataframe of instant_bookable & mean log-price
plot_barplot(mean_log_price_wrt_ib, 'instant_bookable', 'log_price')
<AxesSubplot: title={'center': 'Barplot of instant_bookable'}, xlabel='instant_bookable', ylabel='Count'>
# Mean log price by city.
mean_log_price_wrt_cty = df.groupby(df['city'])['log_price'].mean().to_frame().reset_index() # dataframe of city & mean log-price
plot_barplot(mean_log_price_wrt_cty, 'city', 'log_price')
<AxesSubplot: title={'center': 'Barplot of city'}, xlabel='city', ylabel='Count'>
# Mean log price by bedroom count.
mean_log_price_wrt_bedr = df.groupby(df['bedrooms'])['log_price'].mean().to_frame().reset_index() # dataframe of bedrooms & mean log-price
plot_barplot(mean_log_price_wrt_bedr, 'bedrooms', 'log_price')
<AxesSubplot: title={'center': 'Barplot of bedrooms'}, xlabel='bedrooms', ylabel='Count'>
# Correlation between amenities_count and log_price.
columns_to_include = ['amenities_count','log_price']
# Plot correlation heatmap
plot_correlation_heatmap(df, columns_to_include)
# Mean log price by amenity count.
mean_log_price_wrt_ac = df.groupby(df['amenities_count'])['log_price'].mean().to_frame().reset_index() # dataframe of amenities_count & mean log-price
plot_barplot(mean_log_price_wrt_ac, 'amenities_count', 'log_price')
<AxesSubplot: title={'center': 'Barplot of amenities_count'}, xlabel='amenities_count', ylabel='Count'>
# Example usage:
# Per-city modes of the key categorical/numeric listing attributes.
aggregate_columns = ['property_type','room_type', 'accommodates','bathrooms', 'bedrooms','neighbourhood', 'cancellation_policy']
mode_data_city = calculate_mode(df, 'city', aggregate_columns)
mode_data_city
| mode_property_type | mode_room_type | mode_accommodates | mode_bathrooms | mode_bedrooms | mode_neighbourhood | mode_cancellation_policy | |
|---|---|---|---|---|---|---|---|
| city | |||||||
| Boston | Apartment | Entire home/apt | 2 | 1 | 1 | Allston-Brighton | strict |
| Chicago | Apartment | Entire home/apt | 2 | 1 | 1 | Lakeview | strict |
| DC | Apartment | Entire home/apt | 2 | 1 | 1 | Capitol Hill | flexible |
| LA | Apartment | Entire home/apt | 2 | 1 | 1 | Mid-Wilshire | strict |
| NYC | Apartment | Entire home/apt | 2 | 1 | 1 | Williamsburg | strict |
| SF | Apartment | Entire home/apt | 2 | 1 | 1 | Mission District | strict |
# Correlation between host_tenure and log_price.
columns_to_include = ['host_tenure','log_price']
# Plot correlation heatmap
plot_correlation_heatmap(df, columns_to_include)
# Full numeric feature set vs. log_price.
columns_to_include = ['log_price','accommodates', 'bathrooms','number_of_reviews',
                      'review_scores_rating', 'bedrooms', 'beds',
                      'review_duration', 'time_since_last_review', 'host_tenure',
                      'average_review_score', 'amenities_count', 'has_wireless_internet',
                      'has_kitchen', 'has_heating', 'has_essentials', 'has_smoke_detector']
# Keep the matrix for the sorted view in the next cell.
corr_df = df[columns_to_include].corr()
# Plot correlation heatmap
plot_correlation_heatmap(df, columns_to_include)
# Correlations with log_price, sorted descending.
# NOTE(review): variable name has a typo ('correlatiomn'); left as-is because
# a later cell references it by this spelling.
correlatiomn_matrix = corr_df['log_price'].sort_values(ascending=False).to_frame().reset_index()
correlatiomn_matrix
| index | log_price | |
|---|---|---|
| 0 | log_price | 1.000000 |
| 1 | accommodates | 0.567574 |
| 2 | bedrooms | 0.473028 |
| 3 | beds | 0.441619 |
| 4 | bathrooms | 0.335589 |
| 5 | amenities_count | 0.194528 |
| 6 | has_heating | 0.103929 |
| 7 | has_kitchen | 0.101046 |
| 8 | host_tenure | 0.078244 |
| 9 | review_scores_rating | 0.074475 |
| 10 | has_essentials | 0.053435 |
| 11 | review_duration | 0.052769 |
| 12 | has_smoke_detector | 0.046862 |
| 13 | has_wireless_internet | 0.034074 |
| 14 | time_since_last_review | 0.015599 |
| 15 | average_review_score | 0.007285 |
| 16 | number_of_reviews | -0.032470 |
# Bar view of each feature's correlation with log_price
# (iloc[1:] drops log_price's self-correlation of 1.0).
plot_barplot(correlatiomn_matrix.iloc[1:], 'index', 'log_price') # Correlation matrix
<AxesSubplot: title={'center': 'Barplot of index'}, xlabel='index', ylabel='Count'>
# Example usage:
# Price trends over the first-review / host-since / last-review timelines.
plot_lineplot(df, 'first_review','log_price' )
plot_lineplot(df, 'host_since','log_price' )
plot_lineplot(df, 'last_review','log_price' )
# Peek at the two categorical columns analysed next.
df[['property_type','room_type']]
| property_type | room_type | |
|---|---|---|
| 0 | Apartment | Entire home/apt |
| 1 | Apartment | Entire home/apt |
| 2 | Apartment | Entire home/apt |
| 3 | House | Entire home/apt |
| 4 | Apartment | Entire home/apt |
| ... | ... | ... |
| 74106 | Apartment | Private room |
| 74107 | Apartment | Entire home/apt |
| 74108 | Apartment | Entire home/apt |
| 74109 | Apartment | Entire home/apt |
| 74110 | Boat | Entire home/apt |
74111 rows × 2 columns
# Count listings per (property_type, room_type) pair.
pt_rt_df = df.groupby(['property_type','room_type'])['room_type'].count().to_frame()
pt_rt_df.columns = ['room_type_count']
pt_rt_df = pt_rt_df.reset_index()
# Set Seaborn style
sns.set_style("whitegrid")
# Plot unstacked bar chart
# NOTE(review): bars for different room types share the same x positions and
# are drawn over one another (overlaid, not grouped) — a taller earlier bar
# can hide a shorter later one; confirm this is the intended look.
plt.figure(figsize=(22, 16))
for room_type in pt_rt_df['room_type'].unique():
    subset = pt_rt_df[pt_rt_df['room_type'] == room_type]
    plt.bar(subset['property_type'], subset['room_type_count'], label=room_type)
plt.title('Counts of Property Types by Room Types')
plt.xlabel('Property Type')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Room Type')
plt.show()
# Accommodation capacity by property type.
plot_barplot(df, 'property_type','accommodates')
<AxesSubplot: title={'center': 'Barplot of property_type'}, xlabel='property_type', ylabel='Count'>
# Count listings per (property_type, cancellation_policy) pair.
pt_cp_df = df.groupby(['property_type','cancellation_policy'])['cancellation_policy'].count().to_frame()
pt_cp_df.columns = ['cancellation_policy_count']
pt_cp_df = pt_cp_df.reset_index()
pt_cp_df
| property_type | cancellation_policy | cancellation_policy_count | |
|---|---|---|---|
| 0 | Apartment | flexible | 15288 |
| 1 | Apartment | moderate | 12318 |
| 2 | Apartment | strict | 21301 |
| 3 | Apartment | super_strict_30 | 96 |
| 4 | Bed & Breakfast | flexible | 187 |
| ... | ... | ... | ... |
| 92 | Villa | strict | 104 |
| 93 | Villa | super_strict_60 | 5 |
| 94 | Yurt | flexible | 6 |
| 95 | Yurt | moderate | 2 |
| 96 | Yurt | strict | 1 |
97 rows × 3 columns
# Set Seaborn style
sns.set_style("whitegrid")
# Plot unstacked bar chart
# NOTE(review): as with the room-type chart above, bars are overlaid at the
# same x positions rather than grouped — taller bars can hide shorter ones.
plt.figure(figsize=(22, 16))
for cancellation_policy in pt_cp_df['cancellation_policy'].unique():
    subset = pt_cp_df[pt_cp_df['cancellation_policy'] == cancellation_policy]
    plt.bar(subset['property_type'], subset['cancellation_policy_count'], label=cancellation_policy)
plt.title('Counts of Property Types by cancellation_policy')
plt.xlabel('Property Type')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='cancellation Policy')
plt.show()
# Host tenure by property type.
plot_barplot(df, 'property_type','host_tenure')
<AxesSubplot: title={'center': 'Barplot of property_type'}, xlabel='property_type', ylabel='Count'>
# Average review score by property type.
plot_barplot(df, 'property_type','average_review_score')
<AxesSubplot: title={'center': 'Barplot of property_type'}, xlabel='property_type', ylabel='Count'>
# Amenity count by property type.
plot_barplot(df, 'property_type','amenities_count')
<AxesSubplot: title={'center': 'Barplot of property_type'}, xlabel='property_type', ylabel='Count'>
# Amenity count by room type.
plot_barplot(df, 'room_type','amenities_count')
<AxesSubplot: title={'center': 'Barplot of room_type'}, xlabel='room_type', ylabel='Count'>
# Average review score by room type.
plot_barplot(df, 'room_type','average_review_score')
<AxesSubplot: title={'center': 'Barplot of room_type'}, xlabel='room_type', ylabel='Count'>
# Host tenure by room type.
plot_barplot(df, 'room_type','host_tenure')
<AxesSubplot: title={'center': 'Barplot of room_type'}, xlabel='room_type', ylabel='Count'>
# (removed) exact duplicate of the host_tenure-by-room_type barplot above
<AxesSubplot: title={'center': 'Barplot of room_type'}, xlabel='room_type', ylabel='Count'>
I am not performing any text preprocessing here because this is one-shot sentiment analysis; stopword removal, lemmatization, or stemming could reduce accuracy, so I am skipping the preprocessing step.
# Download VADER lexicon (if not already downloaded)
nltk.download('vader_lexicon')

# Build the analyzer once at module level: the original constructed a new
# SentimentIntensityAnalyzer inside every call, which is needlessly slow when
# applied across ~74k description rows.
_vader_analyzer = SentimentIntensityAnalyzer()

def get_sentiment_category(text):
    """
    Analyzes the sentiment of the given text and returns the sentiment category.

    Parameters:
    - text (str): The text to analyze for sentiment.

    Returns:
    - str: The sentiment category ('Positive', 'Negative', or 'Neutral').
    """
    # VADER's compound score lies in [-1, 1]; +/-0.05 are the conventional
    # thresholds for positive/negative classification.
    compound_score = _vader_analyzer.polarity_scores(text)['compound']
    if compound_score >= 0.05:
        return 'Positive'
    if compound_score <= -0.05:
        return 'Negative'
    return 'Neutral'
[nltk_data] Downloading package vader_lexicon to [nltk_data] C:\Users\91845\AppData\Roaming\nltk_data... [nltk_data] Package vader_lexicon is already up-to-date!
# df['sentiment_category'] = df['description'].apply(get_sentiment_category)
# Load the cached sentiment results (the apply above is slow, so its output
# was saved to disk once and reloaded here).
df_senti = pd.read_csv("intermediate.csv")
# One-hot encode the sentiment labels.
# NOTE(review): this reads df['sentiment_category'], not df_senti — it only
# works if the commented-out apply above already ran in this session; also
# `senti_dummy` is never used afterwards. Verify intent.
senti_dummy = pd.get_dummies(df['sentiment_category'],prefix='senti_',dtype=int)
# df.to_csv("intermediate.csv",index=False)
create_seaborn_countplot(df_senti, 'sentiment_category')
# Top-4 property types among positive-sentiment listings.
pos_property_type = df_senti[df_senti['sentiment_category']=='Positive']['property_type'].value_counts()[:4].to_frame().reset_index()
# Plot with y='index', matching the Negative/Neutral cells below; the original
# passed 'count', which value_counts().reset_index() does not produce in the
# pandas version this notebook ran under.
plot_barplot(pos_property_type, 'property_type', 'index', title='"Most postive comments among properties"')
# Top-4 property types among negative-sentiment listings.
neg_property_type = df_senti[df_senti['sentiment_category']=='Negative']['property_type'].value_counts()[:4].to_frame().reset_index()
plot_barplot(neg_property_type, 'property_type', 'index', title='"Most negtive comments among properties"')
<AxesSubplot: title={'center': '"Most negtive comments among properties"'}, xlabel='property_type', ylabel='Count'>
# Top-4 property types among neutral-sentiment listings.
neu_property_type = df_senti[df_senti['sentiment_category']=='Neutral']['property_type'].value_counts()[:4].to_frame().reset_index()
plot_barplot(neu_property_type, 'property_type', 'index', title='"Most Neutral comments among properties"')
<AxesSubplot: title={'center': '"Most Neutral comments among properties"'}, xlabel='property_type', ylabel='Count'>
# Room types broken down by review sentiment.
plt.figure(figsize=(23, 16))
sns.countplot(x='room_type', data=df_senti, palette='viridis', hue='sentiment_category')
plt.title('Room types with their reviews')  # fixed user-facing typo: 'thier'
plt.show()
# Compare the cached frame's columns against the live frame's.
df_senti.columns
df.columns
Index(['id', 'log_price', 'property_type', 'room_type', 'amenities',
'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy',
'cleaning_fee', 'city', 'description', 'first_review',
'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
'host_since', 'instant_bookable', 'last_review', 'latitude',
'longitude', 'name', 'neighbourhood', 'number_of_reviews',
'review_scores_rating', 'thumbnail_url', 'zipcode', 'bedrooms', 'beds',
'amenities_count', 'has_wireless_internet', 'has_kitchen',
'has_heating', 'has_essentials', 'has_smoke_detector',
'review_duration', 'time_since_last_review', 'host_tenure',
'average_review_score', 'sentiment_category'],
dtype='object')
import plotly.express as px
# Scatter map of listings, coloured and sized by log price.
fig = px.scatter_mapbox(df, lat="latitude", lon="longitude", color="log_price", size="log_price",
                        color_continuous_scale=px.colors.sequential.Viridis, size_max=25, zoom=10)
fig.update_layout(mapbox_style="carto-positron", title="Price Distribution Map")
fig.show()
This map will show the spatial distribution of listing prices. Areas with larger marker sizes indicate higher priced listings.
# Mean log price per neighbourhood, as an interactive bar chart.
neighborhood_prices = df.groupby('neighbourhood')['log_price'].mean().reset_index()
fig = px.bar(neighborhood_prices, x='neighbourhood', y='log_price', title='Average Price by Neighborhood')
fig.show()
This bar plot shows the average listing prices for each neighborhood, allowing you to identify neighborhoods with higher or lower prices.
from sklearn.cluster import KMeans
# Cluster listings on raw latitude/longitude into 5 spatial groups.
# random_state pins the otherwise nondeterministic centroid seeding so cluster
# labels are reproducible across runs; n_init is made explicit to avoid the
# sklearn FutureWarning about its changing default.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)  # Adjust number of clusters as needed
df['cluster'] = kmeans.fit_predict(df[['latitude', 'longitude']])
fig = px.scatter_mapbox(df, lat="latitude", lon="longitude", color="cluster",
                        color_continuous_scale=px.colors.sequential.Viridis, zoom=10)
fig.update_layout(mapbox_style="carto-positron", title="Spatial Clustering of Listings")
fig.show()
This map shows spatial clusters of listings with similar pricing patterns, helping identify areas with distinct pricing characteristics.
import plotly.express as px
# Define city coordinates
city_coordinates = {
    'NYC': {'latitude': 40.7128, 'longitude': -74.0060},
    'SF': {'latitude': 37.7749, 'longitude': -122.4194},
    'DC': {'latitude': 38.9072, 'longitude': -77.0369},
    'LA': {'latitude': 34.0522, 'longitude': -118.2437},
    'Chicago': {'latitude': 41.8781, 'longitude': -87.6298},
    'Boston': {'latitude': 42.3601, 'longitude': -71.0589}
}
# Plot density map for each city
# (each iteration draws the full dataset, re-centred on one city's downtown).
for city, coordinates in city_coordinates.items():
    fig = px.density_mapbox(df, lat='latitude', lon='longitude', z='log_price', radius=10,
                            center=dict(lat=coordinates['latitude'], lon=coordinates['longitude']), zoom=10,
                            mapbox_style="carto-positron", title=f"Density Map of Listing Prices in {city}")
    fig.show()
This heatmap visualizes the density of listing prices, with areas of higher density indicating regions with higher or lower prices.
import plotly.express as px
# Create an interactive map: hover shows neighbourhood, price and types.
fig = px.scatter_mapbox(df, lat='latitude', lon='longitude', color='log_price', hover_name='neighbourhood',
                        hover_data=['log_price', 'property_type', 'room_type'],
                        color_continuous_scale=px.colors.sequential.Viridis, zoom=10)
# Update map layout
fig.update_layout(mapbox_style="carto-positron", title='Interactive Map of Listing Prices', margin={"r":0,"t":0,"l":0,"b":0})
# Show the interactive map
fig.show()
from geopy.distance import great_circle
# Define city center coordinates
city_coordinates = {
    'NYC': {'latitude': 40.7128, 'longitude': -74.0060},
    'SF': {'latitude': 37.7749, 'longitude': -122.4194},
    'DC': {'latitude': 38.9072, 'longitude': -77.0369},
    'LA': {'latitude': 34.0522, 'longitude': -118.2437},
    'Chicago': {'latitude': 41.8781, 'longitude': -87.6298},
    'Boston': {'latitude': 42.3601, 'longitude': -71.0589}
}
# Function to calculate distance to city center
def calculate_distance(row):
    """Return the great-circle distance (miles) from a listing to its city's center.

    Assumes row['city'] is one of the keys above — a new city value would
    raise KeyError here; verify against the data before reuse.
    """
    city = row['city']
    listing_coordinates = (row['latitude'], row['longitude'])
    city_center = (city_coordinates[city]['latitude'], city_coordinates[city]['longitude'])
    return great_circle(city_center, listing_coordinates).miles
# Apply the function to create the distance_to_city_center column
df['distance_to_city_center'] = df.apply(calculate_distance, axis=1)
# Plot scatter map coloured by distance from the city center.
fig = px.scatter_mapbox(df, lat="latitude", lon="longitude", color="distance_to_city_center",
                        color_continuous_scale=px.colors.sequential.Viridis, zoom=10,
                        mapbox_style="carto-positron", title="Proximity to City Centers")
fig.show()
This map shows listings color-coded by their distance from the city center, helping you identify areas closer or farther from the city center.
!jupyter nbconvert --to html HomeStay_EDA.ipynb